## Setup

# clear workspace: drop every object in the current environment (loaded
# packages are not affected), then run the garbage collector
rm(list = ls())
gc()

# load packages
library(stm)
library(tm)
library(tidyverse)
library(stringi)

# set working directory to source file location
# All later paths ("../data/transcripts", "../processed") are relative to the
# folder that contains this script. The location can only be detected inside
# RStudio with a saved document; in any other session (e.g. Rscript) the
# current working directory is left untouched and must already be that folder.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  } else {
    # unsaved document or console context: there is no file path to resolve
    message("Script path unknown; keeping working directory: ", getwd())
  }
  rm(script_path)
} else {
  message("RStudio not available; keeping working directory: ", getwd())
}


# ~~~~~~~~~~


# This script takes about 1.75 hours to run


# ~~~~~~~~~~


## Store keywords for removal

# broadcast transcript-specific words and metadata
# Entries are used as regular expressions wrapped in \b...\b and are removed
# from the (lowercased, punctuation- and digit-free) text one at a time.
stopwords <- c("that is all in for this evening", "starts right now", "all rights reserved", "allrightsreserved", "special edition", "msnbc", "languages", 
               "fox news", "fox", "channel", "all regions", "all industries", "all subjects", "all companies", "all authors", "factiva inc", "factiva", 
               "news", "domestic", "english", "content and programming", "copyright", "asc services ii media  llc", "asc services", "media llc", "llc", 
               "services ii media", "ii media", "services ii", "distributed", "transmitted", "displayed", "without the prior written permission", "asc",
               "materials herein are protected by united states", "law and may not be reproduced", "published", "broadcast", "page", "tonight", "begin video clip", 
               "you may not alter", "or remove any trademark", "or other notice from copies", "content", "starts right now", "unidentified male", 
               "unidentified female", "end video clip", "crosstalk", "commercial break", "search summary", "all of these words", "at least one of these",  
               "none of these words", "this exact phrase", "date", "source", "author", "company", "industry", "subject", "region", "language", "timestamp", 
               "results found", "host", "words", "good evening", "end", "rush transcript", "copy may not be in its final form and may be updated", 
               "dow jones", "january", "february", "april", "may", "june", "july", "august", "september", "october", "november", "december", "jan", "feb", 
               "thanks for being with us", "thank you for joining us", "thanks for joining us", "thanks for having me", "thank you for having me", "stay with us", 
               "well be right back", "thank you for being here", "thank you so much for being here", "pageoffactiva", "incallrightsreserved\f", 
               "document chayesega", "inaudible", "question", "welcome back", "joining me now", "joining me", "right after the break", "after the break", 
               "ill be right back", "ifs", "up next", "picks up our news coverage", "my colleague", "rm", "laughter", "great show as always", "great show", 
               "interview", "oneonone", "video playing", "abc news", "abc", "graphics", "developing stories", "announcer", "coming up", "we are back tonight", 
               "breaking news", "reporter", "thaf", "whaf", "thafs", "whafs")

# Because removal is sequential, a phrase can only match if none of its words
# was already stripped by an earlier entry. In the hand-written order above,
# "news" and "tonight" precede "abc news", "breaking news",
# "picks up our news coverage" and "we are back tonight", which therefore
# could never match. Applying the longest entries first guarantees that every
# phrase is tried before any shorter entry contained in it; unique() drops the
# repeated "starts right now".
stopwords <- unique(stopwords[order(-nchar(stopwords))])

# remove common stop words
# High-frequency English function words. For every source these are stripped
# last, after the broadcast keywords and the source-specific names, so that
# multi-word entries in those lists which contain one of these words
# (e.g. "the five", "all in with chris hayes") can still match as a phrase.
# NOTE(review): "hes", "shes", "thats" and "im" look like contractions with the
# apostrophe dropped, but the cleaning steps replace punctuation with a space
# ("he's" -> "he s"), so these entries only match text that already lacks the
# apostrophe -- confirm against the raw transcripts.
common <- c("the", "and", "of", "a", "to", "in", "was", "it", "he", "that", "i", "she", "had", "his", "they", "but", "as", "her", "with", "for", "is", 
            "on", "said", "you", "not", "were", "so", "all", "be", "at", "one", "there", "him", "from", "have", "then", "which", "them", "this", 
            "out", "could", "when", "into", "now", "who", "my", "by", "their", "we", "will", "like", "are", "what", "if", "me", "would", "very", 
            "no", "been", "about", "where", "an", "how", "only", "came", "or", "do", "here", "its", "did", "can", "through", "must", "has", "than", 
            "too", "come", "our", "go", "upon", "your", "hes", "shes", "thats", "get", "gets", "im", "us")

# USA Today-specific names/titles
# Personal names (presumably reporter bylines -- TODO confirm) plus publication
# markers such as "usa today", "usat" and "issn". Each entry is later removed as
# a whole-word regular expression, one entry at a time in the order given here,
# so a full name has to appear before its individual first/last name to be
# matched as a phrase.
# Several surnames are also ordinary words ("page", "wolf", "bacon", "grace",
# "brook", ...) and are stripped wherever they occur in USA Today text.
# NOTE(review): matching happens after punctuation has been replaced by spaces,
# so "wyatte grantham-philips" cannot match literally, and the "." in entries
# such as "jorge l. ortiz" acts as a regex wildcard rather than a literal dot.
usa <- c("usa today", "karen weintraub", "weintraub", "karen", "adrianna rodriguez", "rodriguez", "david heath", "adrianna", "elizabeth weise", "weise", 
         "elizabeth", "ken alltucker", "alltucker", "susan page", "susan miller", "david jesse", "page", "susan", "grace hauck", "hauck", "grace", "john bacon", 
         "john fritze", "fritze", "bacon", "john", "david jackson", "david oliver", "jackson", "david", "kevin johnson", "kevin mccoy", "johnson", "kevin", 
         "kristine phillips", "phillips", "kristine", "jorge l. ortiz", "jorge l ortiz", "ortiz", "jorge", "richard wolf", "wolf", "richard", "bart jansen", 
         "jansen", "bart", "nicholas wu", "wu", "nicholas", "erin richards", "erin mansfield", "richards", "erin", "trevor hughes", "hughes", "trevor", 
         "joey garrison", "garrison", "joey", "courtney subramanian", "subramanian", "courtney", "michael collins", "michael", "collins", "patrick ryan",
         "christal hayes", "hayes", "christal", "rebecca morin", "morin", "rebecca", "ryan w. miller", "ryan w miller", "ryan miller", "miller", "ryan", 
         "chris quintana", "quintana", "chris woodyard", "chris", "tom vanden brook", "vanden", "brook", "tom", "alia e. dastagir", "dastagir", "alia", 
         "jessica bliss", "lindsay schnell", "schnell", "lindsay", "mccoy", "rick jervis", "jervis", "rick", "jessica guynn", "deborah barfield berry", 
         "barfield", "deborah", "jessica menton", "menton", "jessica", "william cummings", "cummings", "william", "kim hjelmgaard", "hjelmgaard", "kim", 
         "maureen groppe", "groppe", "maureen", "deirdre shesgreen", "deirdre", "shesgreen", "terry demio", "nick penzenstadler", "penzenstadler", "nick", 
         "nicquel terry ellis", "ellis", "terry", "nicquel", "dennis wagner", "wagner", "dennis", "charisse jones", "jones", "charisse", "marisa kwiatkowski", 
         "kwiatkowski", "marisa", "oliver", "woodyard", "coral murphy marcos", "dinah voyles pulver", "voyles", "pulver", "dinah", "brett murphy", "murphy", 
         "brett", "elinor aspegren", "aspegren", "elinor", "doyle rice", "doyle", "donovan slack", "paul egan", "donovan", "paul davidson", "davidson", 
         "paul", "nathan bomey", "bomey", "nathan", "brad heath", "morgan hines", "hines", "morgan", "alan gomez", "gomez", "alan", "rachel axon", "axon", 
         "kelly tyko", "rachel", "tessa duvall", "duvall", "tessa", "brent schrotenboer", "schrotenboer", "brent", "mansfield", "cara kelly", "kelly", "cara", 
         "maria puente", "puente", "maria", "jeanine santucci", "mark emmert", "santucci", "jeanine", "joel shannon", "shannon", "joel", "kyle bagenstose", 
         "bagenstose", "kyle", "josh salman", "dawn gilbertson", "gilbertson", "christine brennan", "brennan", "christine", "nancy armour", "nancy", 
         "armour", "mark nichols", "nichols", "joshua bote", "bote", "joshua", "josh peter", "josh", "peter", "tyko", "jordan culver", "culver", "jordan", 
         "lorenzo reyes", "lorenzo", "reyes", "patrick", "jesse yomtov", "gabe lacques", "lacques", "gabe", "jesse", "guynn", "nicole carroll", "carroll", "nicole", 
         "heath", "doug stanglin", "stanglin", "doug", "matthew brown", "matthew", "dalvin brown", "dalvin", "marcos", "coral", "matt leclercq", "leclercq", 
         "wyatte grantham-philips", "wyatte", "grantham", "philips", "eric j. lyman", "lyman", "eric", "egan", "tresa baldas", "baldas", "tresa", "andrew wolfson", 
         "andrew", "wolfson", "dustin racioppi", "racioppi", "dustin", "jayme deerwester", "deerwester", "jayme", "eileen rivers", "eileen", "dave boucher", 
         "boucher", "dave", "demio", "salman", "yomtov", "curtis tate", "tate", "curtis", "darcy costello", "darcy", "costello", "emmert", "joseph spector", 
         "joseph", "jon campbell", "campbell", "jon", "brad", "usat", "issn", "georgie silvarole", "silvarole", "georgie", "contributing", "associated press")

# ABC-specific names/titles
# Personal names and segment titles ("the index", "person of the week", ...)
# that are stripped from the ABC transcripts. Each entry is removed as a
# whole-word regular expression, one entry at a time in the order given here,
# so a full name has to appear before its individual first/last name to be
# matched as a phrase.
# Two misspelled entries were corrected so the intended names are actually
# removed: "longmna" -> "longman" and "john guinones" -> "john quinones".
abc <- c("wnt", "david muir", "david kerley", "kerley", "muir", "david", "american broadcasting companies", "whit johnson", "johnson", "whit", "ginger zee", 
         "zee", "adrienne bankert", "bankert", "adrienne", "maggie rulli", "rulli", "maggie", "jonathan karl", "jon karl", "karl", "jonathan", "martha raddatz", 
         "raddatz", "martha", "ian pannell", "pannell", "ian", "kyra phillips", "phillips", "kyra", "matt gutman", "gutman", "matt", "linsey davis", "davis", 
         "linsey", "stephanie ramos", "ramos", "stephanie", "will carr", "carr", "clayton sandell", "sandell", "clayton", "steve osunsami", "osunsami", "steve", 
         "eva pilgrim", "eva", "michael strahan", "strahan", "michael", "mary bruce", "bruce", "mary", "robin roberts", "deborah roberts", "deborah", "roberts", 
         "victor oquendo", "oquendo", "rob marciano", "marciano", "stephen ganyard", "ganyard", "stephen", "tom llamas", "tom", "pierre thomas", "thomas", "pierre", 
         "rachel scott", "scott", "rachel", "james longman", "longman", "james", "erielle reshef", "reshef", "erielle", "jennifer ashton", "jen ashton",  "ashton", 
         "jennifer", "rebecca jarvis", "jarvis", "rebecca", "cecilia vega", "vega", "cecilia", "kaylee hartung", "hartung", "kaylee", "terry moran", "moran", 
         "terry", "diane sawyer", "sawyer", "diane", "juju chang", "chang", "juju", "george stephanopoulos", "stephanopoulos", "george", "kayna whitworth", 
         "whitworth", "kayna", "marcus moore", "moore", "marcus", "alex perez", "alex presha", "presha", "perez", "alex", "will reeve", "reeve", "gio benitez", 
         "benitez", "gio", "robert jobson", "jobson", "robert", "zachary kiesch", "kiesch", "zachary", "john nance", "john quinones", "quinones", "nance", "john", 
         "bob woodruff", "woodruff", "bob", "jordyn phelps", "phelps", "jordyn", "dan abrams", "abrams", "dan", "diane macedo", "macedo", "diane", "trevor ault", 
         "ault", "trevor", "zohreen shah", "shah", "zohreen", "janai norman", "janai", "norman", "amy robach", "robach", "amy", "dan harris", "harris", "dan", 
         "zach rael", "rael", "zach", "caroline reinwald", "reinwald", "caroline", "kate thornton", "thornton", "kate", "the index", "american strong", 
         "person of the week", "americastrong", "persons of the week")

# MSNBC-specific names/titles
# Personal names, show titles ("rachel maddow show", "all in with chris hayes",
# "hardball", "meet the press", "politicsnation", ...) and short codes such as
# "msliv", "trmads" and "lwlod" (presumably transcript/document identifiers --
# TODO confirm). Each entry is later removed as a whole-word regular
# expression, one entry at a time in the order given here, so a full name or
# title has to appear before its component words to be matched as a phrase.
# A few entries are listed twice ("mcfaul", "wine banks"); the repeats are
# harmless no-ops.
# NOTE(review): "11th hour" can never match, because digits are replaced by
# spaces before this list is applied.
msnbc <- c("rachel maddow show", "rachel maddow", "maddow", "rachel", "hayes brown", "all in with chris hayes", "chayes", "chris hayes", "hayes", "chris", 
           "hardball", "chris matthews", "matthews", "joyce vance", "vance", "brian williams", "williams", "brian", "steve kornacki", "kornacki", "steve", 
           "nicolle wallace", "wallace", "nicolle", "al sharpton", "sharpton", "alicia menendez", "menendez", "alicia", "chris jansing", "jansing", "maya wiley", 
           "wiley", "maya", "stephanie ruhle", "ruhle", "stephanie", "alex seitzwald", "seitzwald", "shaquille brewster", "shaquille", "brewster", "ari melber", 
           "melber", "ari", "angela rasmussen", "rasmussen", "angela", "sam seder", "seder", "joy reid", "reid", "sam stein", "stein", "michelle goldberg", 
           "goldberg", "michelle", "garrett haake", "haake", "garrett", "craig melvin", "melvin", "craig", "natasha bertrand", "bertrand", "natasha", 
           "brandy zadrozny", "zadrozny", "brandy", "cynthia alksne", "alksne", "cynthia", "schmidt", "laurie garrett", "laurie", "ali velshi", "velshi", 
           "ali", "zerlina maxwell", "maxwell", "zerlina", "geoff bennett", "bennett", "geoff", "paola ramos", "ramos", "paola", "brittany packnett cunningham", 
           "packnett", "cunningham", "brittany", "ayman mohyeldin", "mohyeldin", "ayman", "leigh ann caldwell", "caldwell", "leigh", "ann", "ben collins", "collins", 
           "ben", "matt miller", "miller", "barbara mcquade", "mcquade", "barbara", "trymaine lee", "trymaine", "shannon pettypiece", "shannon", "pettypiece", 
           "jacob soboroff", "soboroff", "jacob", "mara gay", "pete williams", "neal katyal", "katyal", "neal", "andrea mitchell", "andrea", "mitchell", 
           "edward foley", "edward", "foley", "mehdi hasan", "hasan", "mehdi", "jonathan alter", "jonathan capehart", "capehart", "jonathan", "benjy sarlin", 
           "sarlin", "benjy", "sahil kapur", "kapur", "sahil", "nick confessore", "confessore", "nick", "carol lee", "carol", "glenn kirschner", "kirschner", 
           "glenn", "cal perry", "perry", "allen", "jose diazbalart", "diazbalart", "jose", "claire mccaskill", "mccaskill", "claire", "rhodes", "chaves", 
           "kevin tibbles", "tibbles", "kevin", "michael steele", "steele", "michael", "priscilla thompson", "priscilla", "thompson", "nahid bhadelia", "nahid", 
           "bhadelia", "rev", "richard lui", "lui", "richard", "hans nichols", "nichols", "hans", "danny cevallos", "cevallos", "danny", "chuck todd", 
           "meet the press", "rick tyler", "tyler", "joe fryer", "rick", "joe scarborough", "scarborough", "joe", "natalie azar", "natalie", "susan del percio", 
           "del percio", "msliv", "james carville", "carville", "elise jordan", "yasmin vossoughian", "yasmin", "vossoughian", "politicsnation", "cori coffin", 
           "cori", "janell ross", "janell", "lindsey resier", "lindsey", "resier", "noah rothman", "rothman", "noah", "monica alba" , "alba", "monica", 
           "katie fang", "katie", "deepa shivaram", "deepa", "shivaram", "josh lederman", "lederman", "jeremy bash", "jeremy", "blayne alexander", "blayne", 
           "alexander", "morgan chesky", "morgan", "chesky", "courtney kube", "courtney", "kube", "benen", "peter baker", "peter", "andrew weissmann", "weissmann", 
           "andrew", "zeke emanuel", "zeke", "emanuel", "john torres", "john", "torres", "frank figliuzzi", "figliuzzi", "mcfaul", "ezekiel", "mcfaul", "beschloss", 
           "vin gupta", "gupta", "vin", "bill karins", "karins", "chuck rosenberg", "rosenberg", "arouzi", "philip rucker", "rucker", "philip", "leonnig", "trmads", 
           "eugene robinson", "eugene", "robinson", "engel", "robert costa", "costa", "robert", "lawrence odonnell", "odonnell", "lawrence", "jack jacobs", "jacobs", 
           "brett mcgurk", "mcgurk", "brett", "david corn", "david plouffe", "david", "plouffe", "brennan", "matthew sanderson", "sanderson", "matthew", 
           "vaughn hillyard", "hillyard", "vaughn", "jake ward", "mike memoli", "memoli", "victoria defrancesco soto", "annie karni", "karni", "annie", "11th hour", 
           "symone", "george stephanopoulos", "george", "stephanopoulos", "last word", "lwlod", "ruth marcus", "marcus", "ruth", "mieke eoyang", "eoyang", "mieke", 
           "malcolm nance", "nance", "malcolm", "jill wine banks", "wine banks", "wine banks", "jill", "ned price", "ned", "heilemann", "jennifer rubin", "rubin", 
           "jennifer", "wendy sherman", "sherman", "wendy", "hallie jackson", "jackson", "hallie", "paul butler", "paul", "stengel", "jason johnson", "jason", 
           "johnson", "yamiche alcindor", "alcindor", "yamiche", "ashish jha", "jha", "ashish", "larry", "maria teresa kumar", "maria", "teresa", "kumar", 
           "renee graham", "graham", "renee", "stuart stevens", "stuart", "stevens", "lipi roy", "roy", "lipi", "mike murphy", "murphy", "mike", "soborotf", 
           "eddie glaude", "glaude", "eddie", "anand giridharadas", "giridharadas", "anand", "heidi heitkamp", "heidi", "heitkamp", "errin haines", "haines", "errin", 
           "jelani cobb", "cobb", "jelani", "robin givhan", "robin", "givhan", "alex wagner", "wagner", "alex", "caroline", "randall", "jon meacham", "jon",
           "meacham", "ellison barber", "ellison", "jolly", "tiffany cross", "tiffany", "marianna sotomayor", "marianna", "ralston", "benjamin wittes", "wittes", 
           "benjamin", "gibbs", "katy tur", "tur", "katy", "patterson", "joseph fair", "joseph", "kavita patel", "kavita", "patel", "lester holt", "lester", "holt", 
           "jo ling kent", "jo ling", "kent", "vicky nguyen", "vicky", "nguyen", "tom costello", "tom", "costello", "gabe gutierrez", "gabe", "gutierrez", 
           "kate snow", "kate", "mcfadden", "irwin redlener", "irwin", "redlener", "hoda kotb", "hoda", "kotb", "keir simmons", "simmons", "keir", "savannah guthrie", 
           "savannah", "guthrie", "miguel almaguer", "miguel", "almaguer", "zimmern", "ken dilanian", "ken", "dilanian", "radford", "kelly cobiella", "kelly", 
           "cobiella", "charlie cook", "charlie savage", "charlie", "gadi schwartz", "gadi", "schwartz", "latosha brown", "latosha", "frum", "nicholas kristof", 
           "kristof", "nicholas", "schale")

# Fox-specific names/titles
# Personal names, show titles ("ingraham angle", "the five",
# "tucker carlson tonight", ...) and short codes such as "foxfiv" and "tcarl"
# (presumably transcript/document identifiers -- TODO confirm). Each entry is
# later removed as a whole-word regular expression, one entry at a time in the
# order given here, so a full name or title has to appear before its component
# words to be matched as a phrase. Repeated entries ("dan", "lisa", "duffy")
# are harmless no-ops.
# The misspelled entry "godwy" was corrected to "gowdy" so the surname is also
# removed when it appears without "trey".
fox <- c("dan bongino", "dan", "bongino", "karl rove", "karl", "rove", "mark levin", "levin", "gregg jarrett", "jarrett", "gregg", "tammy bruce", "bruce", 
         "tammy", "pete hegseth", "pete", "hegseth", "katie pavlich", "katie", "pavlich", "ari fleischer", "ari", "fleischer", "sarah sanders", "sanders",
         "sarah", "jonathan hunt", "jonathan turley", "jonathan", "turley", "mike huckabee", "mike", "huckabee", "geraldo rivera", "geraldo", "rivera", "newt gingrich", 
         "newt", "gingrich", "joe concha", "joe", "concha", "john solomon", "john", "solomon", "lawrence jones", "jones", "lawrence", "jason chaffetz", "jason", 
         "chaffetz", "sean hannity", "hannity", "sean", "ingraham angle", "laura ingraham", "laura", "ingraham", "benjamin hall", "benjamin", "sara carter", "sara", 
         "carter", "jesse watters", "watters", "jesse", "shan", "trace gallagher", "gallagher", "lara logan", "logan", "lara", "trey gowdy", "gowdy", "trey", 
         "charlie hurt", "charlie", "kimberly strassel", "strassel", "kimberly", "david spunt", "david", "spunt", "lucas tomlinson", "lucas", "tomlinson", 
         "chad pergram", "chad", "pergram", "jeanine pirro", "pirro", "judge jeanine", "jeanine", "marianne rafferty", "rafferty", "marianne", "steve doocy", 
         "doocy", "steve", "jim gray", "jim", "jennifer griffin", "jennifer", "griffin", "yingst", "rick leventhal", "leventhal", "rick", "kevin corke", 
         "kevin", "corke", "bill hemmer", "hemmer", "martha maccallum", "martha", "maccallum", "bret baier", "bret", "baier", "leland vittert", "leland", 
         "vittert", "aisha hasnie", "aisha", "hasnie", "maria bartiromo", "maria", "bartiromo", "lisa boothe", "lisa", "boothe", "tobin", "rachel camposduffy", 
         "rachel", "camposduffy", "nicole saphier", "nicole", "saphier", "shannon bream", "shannon", "bream", "emily compagno", "compagno", "emily", "janette nesheiwat", 
         "janette", "nesheiwat", "webb", "bryan llenas", "bryan", "llenas", "deneen borelli", "deneen", "borelli", "joey", "campos", "duffy", "jacqui heinrich", 
         "jacqui", "heinrich", "mark meredith", "meredith", "david lee miller", "johnny", "susan li", "li", "susan", "alicia acuna", "alicia", "acuna", "ellison barber", 
         "ellison", "melissa francis", "melissa", "francis", "dan hoffman", "dan", "hoffman", "chris wallace", "chris", "wallace", "brit hume", "brit", "hume", 
         "juan williams", "the five", "williams", "juan", "dana perino", "perino", "dana", "jessica tarlov", "jessica", "tarlov", "marc thiessen", "thiessen",
         "stirewalt", "kristen soltis anderson", "soltis", "anderson", "marc siegel", "marc", "siegel", "tucker carlson tonight", "tucker carlson", "tucker", 
         "carlson", "ed henry", "ed", "henry", "courtney godfrey", "courtney", "godfrey", "dane placko", "placko", "dane", "jeff paul", "paul", "jeff", 
         "springer", "harrigan", "casey stegall", "stegall", "casey", "ted", "janice yu", "yu", "janice", "greg gutfeld", "gutfeld", "greg", "tia ewing", 
         "ewing", "tia", "matt finn", "finn", "tom shillue", "tom", "shillue", "harold ford", "harold", "andy mccarthy", "andy", "dagen mcdowell", "dagen", 
         "mcdowell", "lisa kennedy montgomery", "lisa", "marie harf", "marie", "harf", "kennedy montgomery", "foxfiv", "sandra smith", "smith", "sandra", 
         "brian kilmeade", "kilmeade", "brian", "kristin fisher", "fisher", "kristin", "rich edson", "edson", "richard fowler", "richard", "fowler", 
         "will cain", "cain", "neil cavuto", "neil", "cavuto", "duffy", "lauren blanchard", "lauren", "blanchard", "donna brazile", "donna", "brazile", "phil keating", 
         "phil", "keating", "ingle", "bill melugin", "melugin", "mark steyn", "steyn", "whitlock", "tcarl", "brett larson", "brett", "larson", "charles gasparino", 
         "charles", "gasparino", "nancy grace", "nancy", "walid phares", "phares", "walid", "gillian turner", "turner", "gillian", "dean", "rich reichmuth", 
         "reichmuth", "craig patrick", "craig", "patrick", "ashley strohmier", "ashley", "strohmier", "deroy murdock", "deroy", "murdock", "mollie hemingway", 
         "mollie", "hemingway", "raymond arroyo", "raymond", "arroyo", "sooji nam", "nam", "sooji", "lou dobbs", "dobbs", "lou", "andrew napolitano", "napolitano", 
         "andrew", "harris faulkner", "faulkner", "harris", "walsh", "byron york", "byron", "hart", "hillary vaughn", "vaughn", "michael goodwin", "goodwin", 
         "michael", "marty makary", "makary", "marty", "karen scullin", "scullin", "karen", "serrie", "oconnell", "simone del rosario", "simone", "del rosario",
         "adam klotz", "klotz", "adam", "carly shimkus", "carly", "robert ray", "robert guaderrama", "robert", "guaderrama", "randy", "emanuel", "sol wisenberg", 
         "sol", "wisenberg", "anita vogel", "vogel", "anita", "matt whitaker", "whitaker", "peter", "doug schoen", "doug", "schoen", "ingang", "joyce", 
         "dinesh dsouza", "dinesh", "dsouza", "harmeet dhillon", "dhillon", "harmeet", "james freeman", "james", "freeman", "hahn", "jackie ibanez", "jackie", 
         "ibanez", "candace owens", "owens", "candace")


# ~~~~~~~~~~


## Preprocess text

# initialize data frame for storing final preprocessed text
data_final <- data.frame()

# (1) ABC World News Tonight

# load data; every transcript CSV below is read relative to this folder
setwd("../data/transcripts")
data <- read.csv("ABC World News Tonight.csv") %>%
  mutate(source = "ABC", program = "World News Tonight")

# replace every non-printable character with a space
data$text <- stri_replace_all_regex(data$text, "[^[:print:]]", " ")

# remove the literal tags `(MALE)` and `(FEMALE)`
# fixed = TRUE makes the parentheses literal. As a regular expression,
# "\\b(MALE)\\b" treats them as a group and strips the bare word MALE instead,
# which also turns "UNIDENTIFIED MALE" into "UNIDENTIFIED" before the
# "unidentified male" stopword has a chance to match.
data$text <- gsub("(MALE)", " ", data$text, fixed = TRUE)
data$text <- gsub("(FEMALE)", " ", data$text, fixed = TRUE)

# make everything lowercase
data$text <- tolower(data$text)

# remove punctuation and numbers
data$text <- gsub("[[:punct:]]", " ", data$text)
data$text <- gsub("[[:digit:]]", " ", data$text)

# remove broadcast-specific stopwords, source-specific names/titles, and common stopwords
# Terms are stripped one after another as whole words, in list order; Reduce()
# threads the text through each replacement and is a no-op for an empty list.
data$text <- Reduce(
  function(text, term) gsub(paste0("\\b", term, "\\b"), " ", text),
  c(stopwords, abc, common),
  data$text
)
rm(abc)

# append to final data
data_final <- bind_rows(data_final, data)


# (2) USA Today

# load data
data <- read.csv("USA Today.csv") %>%
  mutate(source = "USA Today", program = "USA Today")

# replace every non-printable character with a space
data$text <- stri_replace_all_regex(data$text, "[^[:print:]]", " ")

# make everything lowercase
data$text <- tolower(data$text)

# remove punctuation and numbers
data$text <- gsub("[[:punct:]]", " ", data$text)
data$text <- gsub("[[:digit:]]", " ", data$text)

# remove source-specific names/titles and common stopwords
# (the broadcast-specific stopwords are not applied to this source)
# Terms are stripped one after another as whole words, in list order; Reduce()
# threads the text through each replacement and is a no-op for an empty list.
data$text <- Reduce(
  function(text, term) gsub(paste0("\\b", term, "\\b"), " ", text),
  c(usa, common),
  data$text
)
rm(usa)

# append to final data
data_final <- bind_rows(data_final, data)


# (3) MSNBC

# load data: one CSV per program (file name = program name + ".csv"), each
# tagged with the program it came from, then stacked into one data frame
msnbc_programs <- c(
  "All In with Chris Hayes",
  "PoliticsNation",
  "The Last Word with Lawrence ODonnell",
  "The Rachel Maddow Show"
)
data <- bind_rows(lapply(msnbc_programs, function(program_name) {
  read.csv(paste0(program_name, ".csv")) %>% mutate(program = program_name)
}))
rm(msnbc_programs)
data <- data %>% mutate(source = "MSNBC")

# replace every non-printable character with a space
data$text <- stri_replace_all_regex(data$text, "[^[:print:]]", " ")

# make everything lowercase
data$text <- tolower(data$text)

# remove punctuation and numbers
data$text <- gsub("[[:punct:]]", " ", data$text)
data$text <- gsub("[[:digit:]]", " ", data$text)

# remove broadcast-specific stopwords, source-specific names/titles, and common stopwords
# Terms are stripped one after another as whole words, in list order; Reduce()
# threads the text through each replacement and is a no-op for an empty list.
data$text <- Reduce(
  function(text, term) gsub(paste0("\\b", term, "\\b"), " ", text),
  c(stopwords, msnbc, common),
  data$text
)
rm(msnbc)

# append to final data
data_final <- bind_rows(data_final, data)


# (4) Fox

# load data: one CSV per program (file name = program name + ".csv"), each
# tagged with the program it came from, then stacked into one data frame
fox_programs <- c(
  "Hannity",
  "The Five",
  "The Ingraham Angle",
  "Tucker Carlson Tonight"
)
data <- bind_rows(lapply(fox_programs, function(program_name) {
  read.csv(paste0(program_name, ".csv")) %>% mutate(program = program_name)
}))
rm(fox_programs)
data <- data %>% mutate(source = "Fox")

# replace every non-printable character with a space
data$text <- stri_replace_all_regex(data$text, "[^[:print:]]", " ")

# make everything lowercase
data$text <- tolower(data$text)

# remove punctuation and numbers
data$text <- gsub("[[:punct:]]", " ", data$text)
data$text <- gsub("[[:digit:]]", " ", data$text)

# remove broadcast-specific stopwords, source-specific names/titles, and common stopwords
# Terms are stripped one after another as whole words, in list order; Reduce()
# threads the text through each replacement and is a no-op for an empty list.
data$text <- Reduce(
  function(text, term) gsub(paste0("\\b", term, "\\b"), " ", text),
  c(stopwords, fox, common),
  data$text
)
# this is the last source, so the keyword lists are no longer needed
rm(stopwords, fox, common)

# append to final data
data_final <- bind_rows(data_final, data)
rm(data)


# (5) Full, merged dataset

# fix some typos: split words that were fused together in the transcripts
# (patterns are plain substrings and are applied in this order)
typo_fixes <- c(
  thepfizer = "the pfizer",
  thefbi = "the fbi",
  thegolden = "the golden",
  thebrooklyn = "the brooklyn",
  thebrookings = "the brookings",
  thequds = "the quds",
  theunited = "the united",
  thegeneral = "the general",
  lindseygraham = "lindsey graham",
  theroger = "the roger"
)
for (typo in names(typo_fixes)) {
  data_final$text <- gsub(typo, typo_fixes[[typo]], data_final$text)
}
rm(typo, typo_fixes)

# strip every stand-alone "inc" token; gsub() rather than sub(), which would
# only remove the first occurrence in each row
data_final$text <- gsub("\\binc\\b", " ", data_final$text)

# collapse text into one entry per transcript (i.e. by program-date)
# Whitespace is normalised before the empty-body filter: removed terms leave
# spaces behind, so a transcript that lost all of its words is whitespace-only
# rather than "" and would otherwise slip through the filter.
# ungroup() makes the saved object a plain tibble instead of one that is still
# grouped by date and source.
data <- data_final %>%
  group_by(date, source, program) %>%
  summarize(body = paste(text, collapse = " ")) %>%
  ungroup() %>%
  mutate(body = trimws(gsub("\\s+", " ", body))) %>%
  filter(body != "")

# save preprocessed text data to 'data/processed' folder
# (path is relative to the transcripts folder set as working directory above)
setwd("../processed")
saveRDS(data, "text_preprocessed.rds")

